Required Packages

library(here)
library(tidyverse)
library(minfi)
library(ggplot2)
library(ENmix)
library(stringr)
library(geneplotter)
library(ChAMP)
library(wateRmelon)

Load Data

# setting project and data directories

change_here = function(new_path){
  new_root = here:::.root_env
  new_root$f = function(...){file.path(new_path, ...)}
  assignInNamespace(".root_env", new_root, ns = "here")
}

change_here('/FACT_850K/')
setwd('/FACT_850K/')

sample_info = read.csv(here("FACT_IDs_850k.csv"))

# paths to idats

files_idat_raw = paste0(here(), '/EPIC/', sample_info$Sentrix_ID, '_', sample_info$Sentrix_Position)

# combining with pheno file

pheno = cbind(sample_info, files_idat_raw)
colnames(pheno)[ncol(pheno)] = 'Basename'

Reading idats

RGset = read.metharray.exp(targets=pheno, extended = TRUE)

Internal Quality Control

Produces a PDF QC report for Illumina Infinium Human Methylation 450k arrays, which is useful for identifying failed samples.

qcReport(RGset, pdf="fact_850k_ControlProbes.pdf")

The ControlProbesPlot.pdf file is a 19-page report of the internal quality control information.

Outlier information

A number of functions are run sequentially on the RGset object.First outlier values are thresholded using fixMethOutliers. Then qc is performed using getQC and then sample specific sex is estimated using getSex.

# Creation Of A MethylSet Without Normalization (minifi)

Mset = preprocessRaw(RGset)

QC_minfi = minfiQC(Mset, fixOutliers = TRUE)

# Produce QC diagnostic plots

quartz()
plotQC(QC_minfi$qc)
quartz.save('PotentialOutliers.png', type = 'png', dpi = 200)

Potential Outliers

knitr::include_graphics("PotentialOutliers.png")

Pre Filtering

#Plot quality control plot, mdsplot, densityPlot, dendrogram.

pheno$Cig.Ever = factor(pheno$Cig.Ever)

champ.QC(beta = getBeta(Mset), pheno=pheno$Cig.Ever, mdsPlot=TRUE, densityPlot=TRUE, dendrogram=TRUE, PDFplot=TRUE, Rplot= FALSE,Feature.sel="None", resultsDir=here::here("QC_preFiltering"))

Pre Filtering Density

knitr::include_graphics("QC_preFiltering/raw_densityPlot.png")

Pre Filtering Beta Values

knitr::include_graphics("QC_preFiltering/raw_mdsPlot.png")

Pre Filtering Dendogram

knitr::include_graphics("QC_preFiltering/raw_SampleCluster.png")

Pre-Filtering SVD

Imputing missing beta values before performing SVD

table(is.na(getBeta(Mset)))
# FALSE     TRUE 
# 27737797  955

beta = getBeta(Mset)
betaImpute = champ.impute(beta=beta, pd=pheno, k=5, ProbeCutoff=0.2, SampleCutoff=0.1)

champ.SVD(beta = betaImpute$beta, rgSet=RGset, pd=pheno[,c(8,10,25,29,30,31,32,33,38,46)], RGEffect=TRUE, Rplot=FALSE, resultsDir=here::here("QC_preFiltering"))
knitr::include_graphics("QC_preFilteringSVDsummary.png")

Post Filtering

Extract information for data quanlity controls: detection P values, number of beads and averaged bisulfite conversion intensity. The function can also identify low quality samples and probes, as well as outlier samples based on total intensity or beta value distribution.

qc = ENmix::QCinfo(RGset, distplot=FALSE)

# 0  samples with percentage of low quality CpG value greater than  0.05  or bisulfite intensity less than  8316.313 
# 4806  CpGs with percentage of low quality value greater than  0.05 
# Ploting qc_sample.jpg ...Done
# Ploting qc_CpG.jpg ...Done
# Identifying ourlier samples based on beta or total intensity values...
# After excluding low quality samples and CpGs
# 0  samples are outliers based on averaged total intensity value 
# 0  samples are outliers in beta value distribution 
# 0  outlier samples were added into badsample list


filtered = ChAMP::champ.filter(beta=getBeta(Mset), pd = pheno, beadcount = qc$nbead, detP = qc$detP, arraytype = "EPIC", SampleCutoff=0.05)
              
  # [ Section 1:  Check Input Start ]
  # You have inputed beta for Analysis.

  # pd file provided, checking if it's in accord with Data Matrix...
    # pd file check success.

  # Parameter filterDetP is TRUE, checking if detP in accord with Data Matrix...
    # detP check success.

  # Parameter filterBeads is TRUE, checking if beadcount in accord with Data Matrix...
    # beadcount check success.

  # parameter autoimpute is TRUE. Checking if the conditions are fulfilled...
    # !!! ProbeCutoff is 0, which means you have no needs to do imputation. autoimpute has been reset FALSE.

  # Checking Finished :filterDetP,filterBeads,filterMultiHit,filterSNPs,filterNoCG,filterXY would be done on beta.
  # You also provided :detP,beadcount .
# [ Section 1: Check Input Done ]


# [ Section 2: Filtering Start >>

  # Filtering Detect P value Start
    # The fraction of failed positions per sample
    # You may need to delete samples with high proportion of failed probes:

                    # Failed CpG Fraction.
# 201005010046_R01C01         0.0002318778
# 201005010046_R02C01         0.0002434140
# 201005010046_R03C01         0.0002480285
# 201005010046_R04C01         0.0002041909
# 201005010046_R05C01         0.0002999414
# 201005010046_R06C01         0.0002537966
# 201005010046_R07C01         0.0002791762
# 201005010046_R08C01         0.0002503357
# 200999660113_R01C01         0.0004879816
# 200999660113_R02C01         0.0004591411
# 200999660113_R03C01         0.0006114190
# 200999660113_R04C01         0.0009240502
# 200999660113_R05C01         0.0005733495
# 200999660113_R06C01         0.0006667928
# 200999660113_R07C01         0.0006437204
# 200999660113_R08C01         0.0006610247
# 200932680177_R01C01         0.0002584110
# 200932680177_R02C01         0.0004141499
# 200932680177_R03C01         0.0002422604
# 200932680177_R04C01         0.0003772340
# 200932680177_R05C01         0.0003357036
# 200932680177_R06C01         0.0003553152
# 200932680177_R07C01         0.0003553152
# 200932680177_R08C01         0.0004141499
# 201004220064_R01C01         0.0003968455
# 201004220064_R02C01         0.0003645442
# 201004220064_R03C01         0.0003195529
# 201004220064_R04C01         0.0003333964
# 201004220064_R05C01         0.0004129962
# 201004220064_R06C01         0.0003403181
# 201004220064_R07C01         0.0003287819
# 201004220064_R08C01         0.0003910774

    # Filtering probes with a detection p-value above 0.01.
    # Removing 3338 probes.
    # If a large number of probes have been removed, ChAMP suggests you to identify potentially bad samples

  # Filtering BeadCount Start
    # Filtering probes with a beadcount <3 in at least 5% of samples.
    # Removing 0 probes

  # Filtering NoCG Start
    # Only Keep CpGs, removing 2931 probes from the analysis.

  # Filtering SNPs Start
    # Using general EPIC SNP list for filtering.
    # Filtering probes with SNPs as identified in Zhou's Nucleic Acids Research Paper 2016.
    # Removing 97356 probes from the analysis.

  # Filtering MultiHit Start
    # Filtering probes that align to multiple locations as identified in Nordlund et al
    # Removing 24 probes from the analysis.

  # Filtering XY Start
    # Filtering probes located on X,Y chromosome, removing 16806 probes from the analysis.

  # Updating PD file

  # Fixing Outliers Start
    # Replacing all value smaller/equal to 0 with smallest positive value.
    # Replacing all value greater/equal to 1 with largest value below 1..
# [ Section 2: Filtering Done ]

 # All filterings are Done, now you have 746381 probes and 32 samples.
 
 
pheno = filtered$pd
betas = filtered$beta
  
mvals = filtered$M
filtered_CpG = rownames(betas)
unfiltered_CpG = rownames(getBeta(Mset))
outCpG = unfiltered_CpG[!(unfiltered_CpG %in% filtered_CpG)]
  
champ.QC(beta = betas, pheno=pheno$Cig.Ever, mdsPlot=TRUE, densityPlot=TRUE, dendrogram=TRUE, PDFplot=TRUE, Rplot= FALSE, Feature.sel="None", resultsDir=here::here("QC_postFiltering"))
  
save(qc, filtered, RGset, outCpG, pheno, mvals, betas, file = here("FACT_850k_QC.RData"))

Post-filtering Density

knitr::include_graphics("QC_postFiltering/raw_densityPlot.png")

Post-filtering Beta Values

knitr::include_graphics("QC_postFiltering/raw_mdsPlot.png")

Post-filtering Dendogram

knitr::include_graphics("QC_postFiltering/raw_SampleCluster.png")

Post-Filtering SVD

champ.SVD(beta = betas, rgSet=RGset, pd=pheno[,c(8,10,25,29,30,31,32,33,38,46)], RGEffect=TRUE, Rplot=FALSE, resultsDir=here::here("QC_postFiltering"))
knitr::include_graphics("QC_postFilteringSVDsummary.png")

Probe-level filtering

RGset.noob <- preprocessNoob(RGset)
save(RGset.noob,file="RGsetnoob.RData")

# plot probes density 
densityPlot(RGset, main = "density plots before and after preprocessing", pal="#440154FF", ylim=c(0,4.5))
densityPlot(RGset.noob, add = F, pal = "#FDE725FF")

# Add legend
legend("topleft", c("Noob","Raw"), lty=c(1,1), title="Normalization", bty='n', cex=1.3, col=c("#FDE725FF","#440154FF"))
 
quartz.save('noobDensity.png', type = "png", dpi = 300)
knitr::include_graphics('noobDensity.png')

Estimate cell counts

cell_counts = estimateCellCounts2(RGset, compositeCellType = "Blood", referencePlatform = "IlluminaHumanMethylationEPIC", returnAll = TRUE, meanPlot = FALSE, verbose = TRUE)

write.csv(cell_counts$counts, file = "fact_850k_cellcounts.csv", row.names = F)

save(cell_counts,file = "fact_850k_cell_counts.RData")

# add cell estimates to pheno file
cellprop = cell_counts$counts
cellprop = data.frame(cellprop)
cellprop$Sample_Name = rownames(cellprop)

pheno = merge(pheno, cellprop, by = 'Sample_Name')

ids = colnames(RGset)

pheno = pheno[match(ids, pheno$Sample_Name),]
table(pheno$Sample_Name == colnames(RGset))
# TRUE 
# 32

boxplot(cellprop[,c(1:6)]*100, col=1:ncol(cellprop),xlab="Cell type",ylab="Estimated %",main="Cell type distribution")
quartz.save('cellprop', type = "png", dpi = 300)
knitr::include_graphics('cellprop.png')

Normalization

FunNorm

fun = preprocessFunnorm(RGset)
betas_fun = getBeta(fun)
betas_fun = betas_fun[!rownames(betas_fun) %in% outCpG,]

dim(betas_fun)
# 745578     32
betas_fun = Harman::shiftBetas(betas_fun, shiftBy=1e-4)

mvals_fun <- B2M(betas_fun)

champ.QC(beta = betas_fun, pheno= pheno$Cig.Ever, mdsPlot=TRUE, densityPlot=TRUE, dendrogram=TRUE, PDFplot=TRUE, Rplot= FALSE, Feature.sel="None", resultsDir=here::here("postFunnorm"))

champ.SVD(beta = betas_fun, rgSet=RGset, pd=pheno[,c(8,10,25,29,30,31,32,33,38,46)], RGEffect=TRUE, Rplot=FALSE, resultsDir=here::here("postFunnorm"))

save(betas_fun, mvals_fun, pheno, file = here("fact_850k_funnorm_data.RData"))

Post-normalization Density

knitr::include_graphics("postFunnorm/raw_densityPlot.png")

Post-normalization Beta Values

knitr::include_graphics("postFunnorm/raw_mdsPlot.png")

Post-normalization Dendogram

knitr::include_graphics("postFunnorm/raw_SampleCluster.png")

knitr::include_graphics("postFunnormSVDsummary.png")

Quantile normalizaiton

# quan = preprocessQuantile(RGset, fixOutliers = TRUE, quantileNormalize=TRUE, stratified=TRUE)
# quan = quan[!rownames(quan) %in% outCpG]
# quan = mapToGenome(quan)
# mvals = assays(quan)$M
# betas = M2B(mvals)
# newBetas = Harman::shiftBetas(betas, shiftBy=1e-4)
# # No beta values were found to be 0 or 1. No shifts made.
# mvals_Quantile = B2M(newBetas)
# betas_Quantile = M2B(mvals_Quantile)
# dim(betas_Quantile)
# # 411400     48

# champ.QC(beta = betas_Quantile, pheno= pheno$expGroup, mdsPlot=TRUE,
           # densityPlot=TRUE, dendrogram=TRUE, PDFplot=TRUE, Rplot= FALSE,
           # Feature.sel="None", resultsDir=here::here("postNorm"))

# champ.SVD(beta = betas_Quantile, rgSet=RGset, pd=pheno[-c(1,2,3,4,10,19,20)], RGEffect=TRUE,
            # Rplot=FALSE, resultsDir=here::here("postNorm"))

# save(betas_Quantile, mvals_Quantile, pheno, file = here("fact_quantNorm_data.RData"))

Post-normalization Density

# knitr::include_graphics("postNorm/raw_densityPlot.png")

Post-normalization Beta Values

# knitr::include_graphics("postNorm/raw_mdsPlot.png")

Post-normalization Dendogram

# knitr::include_graphics("postNorm/raw_SampleCluster.png")
# knitr::include_graphics("postNormSVDsummary.png")

Batch correction

# table(pheno$Sample_Name == colnames(mvals_fun))
# # TRUE 
# #  32

# # PCA 
# fact.pca = prcomp(mvals_fun, scale = TRUE)

# pairs(fact.pca$rotation[,1:5], pch = 19, cex = 0.5, col = factor(pheno$Cig.Ever))

# quartz.save('pca_batch.png', type = "png", dpi = 300)
# knitr::include_graphics("pca_batch.png")
# batch = factor(pheno$Batch)
# modcombat = model.matrix(~1, data=pheno)

# Mcombat = ComBat(dat = as.matrix(mvals_fun), batch = batch, mod = modcombat)

# # PCA 
# fact.pca.combat = prcomp(Mcombat, scale = TRUE)

# pairs(fact.pca.combat$rotation[,1:5], pch = 19, cex = 0.5, col = factor(pheno$Batch))

# quartz.save('pca_batch_combat.png', type = "png", dpi = 300)

# save(Mcombat, pheno, file = here("fact_funnorm_combat_data.RData"))
# knitr::include_graphics("pca_batch_combat.png")